In this vignette, I compare healthy and obese snRNA-seq data from human perivascular adipose tissue (PVAT). The two samples are taken from an original collection of three samples. We are only interested in two patients in this analysis:
There are 12,657 and 6,456 cells in samples 1 and 3 respectively that were sequenced on the Illumina NovaSeq 6000. The raw data can be found on the Gene Expression Omnibus website at this link.
The working directory can be set by running the following command in the terminal: setwd("/Users/hannahbazin/Desktop/Cambridge/Academics/Han_Lab/MPhil/mphil-project")
Because pathfindR includes a heuristic search algorithm to identify active subnetworks, we set a random seed for reproducibility.
# Set random seed for reproducibility
set.seed(42)
library(GEOquery)
## Loading required package: Biobase
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
## tapply, union, unique, unsplit, which.max, which.min
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)
library(Seurat)
## Attaching SeuratObject
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:Biobase':
##
## combine
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(patchwork)
library(ggplot2)
library(ggpubr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine() masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(pathfindR)
## Loading required package: pathfindR.data
## ##############################################################################
## Welcome to pathfindR!
##
## Please cite the article below if you use pathfindR in published reseach:
##
## Ulgen E, Ozisik O, Sezerman OU. 2019. pathfindR: An R Package for Comprehensive
## Identification of Enriched Pathways in Omics Data Through Active Subnetworks.
## Front. Genet. doi:10.3389/fgene.2019.00858
##
## ##############################################################################
For consistency we reload the integrated Seurat object generated previously in the notebook titled human_PVAT_snRNAseq.Rmd. We then subset the object into one healthy and one obese object with the corresponding data.
# Load integrated Seurat object from previous analysis
load(file = "data/analysis/anno_combined.RData")
# Subset to samples 1 and 3
healthy <- subset(anno_combined, subset = sampleType == "GSM5068996")
obese <- subset(anno_combined, subset = sampleType == "GSM5068998")
It is helpful to visualise individual UMAP plots for the healthy and obese patients.
# Plot UMAP per sample
healthy_umap <- DimPlot(healthy, reduction = "umap") + ggtitle("Healthy Sample")
obese_umap <- DimPlot(obese, reduction = "umap") + ggtitle("Obese Sample")
# Save plots
pdf("results/humanPVATsn/pathfindR/comparison1v3/UMAP_healthy.pdf", width = 12, height = 6)
print(healthy_umap)
invisible(dev.off())
pdf("results/humanPVATsn/pathfindR/comparison1v3/UMAP_obese.pdf", width = 12, height = 6)
print(obese_umap)
invisible(dev.off())
# Show plot
healthy_umap
obese_umap
Subsetting a Seurat object may cause the loss of the scaled data in the object. We verify this and recompute the scaled data for both patients. This is a time-consuming step, so it is best to save the R objects.
# Ensure default assay is RNA assay
DefaultAssay(healthy) <- "RNA"
DefaultAssay(obese) <- "RNA"
# Check that scaled data was preserved - this can be lost while subsetting
#healthy@assays$RNA@scale.data[1:10, 1:10]
#obese@assays$RNA@scale.data[1:10, 1:10]
# Scale data for each subset separately - if needed
# This is a time-consuming step, save the object
if (file.exists("data/analysis/healthy_scaled.RData")) {
message("Loading existing healthy scaled data file.")
load(file = "data/analysis/healthy_scaled.RData")
} else {
message("Missing healthy scaled data file. Scaling data now.")
healthy_scaled <- ScaleData(healthy, vars.to.regress = "percent.mt")
save(healthy_scaled, file = "data/analysis/healthy_scaled.RData")
}
## Loading existing healthy scaled data file.
if (file.exists("data/analysis/obese_scaled.RData")) {
message("Loading existing obese scaled data file.")
load(file = "data/analysis/obese_scaled.RData")
} else {
message("Missing obese scaled data file. Scaling data now.")
obese_scaled <- ScaleData(obese, vars.to.regress = "percent.mt")
save(obese_scaled, file = "data/analysis/obese_scaled.RData")
}
## Loading existing obese scaled data file.
The FindAllMarkers function finds markers (i.e. differentially expressed genes) for each of the clusters in a dataset. By default, it uses a Wilcoxon Rank Sum test to identify differentially expressed genes between two groups. The function compares one cluster to all other clusters, therefore it may identify pathways reflecting differences between adipocytes and non-adipocytes rather than differences between adipocyte developmental stages.
For this reason, we use the FindMarkers function instead. This function is called to identify differentially expressed genes in adipocyte clusters by comparing them only to the three other adipocyte clusters. Seurat combines all the clusters in ident.2 into a single reference group.
The parameters are set as follows:
- min.pct = 0.3 only tests genes detected in 30% of either of the two populations being compared. This speeds up the function and allows us to focus on more biologically relevant genes.
- logfc.threshold = 0.3 limits testing to genes that show at least 0.3-fold difference (log-scale) between the two groups (removes weaker signals).
# Select clusters in adipocyte lineage
adipo_clusters <- c("Early pre-adipocytes", "Intermediate pre-adipocytes", "Transitional adipocytes", "Mature adipocytes")
healthy_adipo <- subset(healthy_scaled, idents = adipo_clusters)
obese_adipo <- subset(obese_scaled, idents = adipo_clusters)
# Define marker file paths
healthy_marker_files <- paste0("results/humanPVATsn/pathfindR/comparison1v3/healthy_markers_", gsub(" ", "_", tolower(adipo_clusters)), ".csv")
obese_marker_files <- paste0("results/humanPVATsn/pathfindR/comparison1v3/obese_markers_", gsub(" ", "_", tolower(adipo_clusters)), ".csv")
# Function to run FindMarkers for each cluster
run_find_markers <- function(object, condition, marker_files) {
# Identify missing files
missing_files <- marker_files[!file.exists(marker_files)]
if (length(missing_files) > 0) {
message("Missing marker files: ", paste(missing_files, collapse=", "))
}
# Only run FindMarkers if any marker file is missing
if (!all(file.exists(marker_files))) {
for (cluster in adipo_clusters) {
markers <- FindMarkers(
object = object,
ident.1 = cluster,
ident.2 = setdiff(adipo_clusters, cluster),
min.pct = 0.3, # Min fraction of cells expressing the genes
logfc.threshold = 0.3, # Min log fold change
verbose = TRUE
)
# Save CSV
marker_file <- paste0("results/humanPVATsn/pathfindR/comparison1v3/", condition, "_markers_", gsub(" ", "_", tolower(cluster)), ".csv")
write.csv(markers, marker_file, row.names = TRUE)
message(paste0("Saved markers for ", condition, " - ", cluster, " to ", marker_file))
}
} else {
message(paste0("All marker files already exist for ", condition, ". Skipping FindMarkers()."))
}
}
# Ensure default assay is RNA assay
DefaultAssay(healthy_adipo) <- "RNA"
DefaultAssay(obese_adipo) <- "RNA"
# Run FindMarkers for healthy and obese
run_find_markers(healthy_adipo, "healthy", healthy_marker_files)
## All marker files already exist for healthy. Skipping FindMarkers().
run_find_markers(obese_adipo, "obese", obese_marker_files)
## All marker files already exist for obese. Skipping FindMarkers().
We can compute the fraction of cells within the adipocyte lineage part of each differentiation step.
# Function to calculate cluster percentages of adipocyte lineage
get_cluster_percentages <- function(seurat_obj) {
subset_ident <- seurat_obj@active.ident[seurat_obj@active.ident %in% adipo_clusters] # Filter only relevant clusters
total_cells <- length(subset_ident)
percentages <- sapply(adipo_clusters, function(cluster) (sum(subset_ident == cluster) / total_cells) * 100)
return(percentages)
}
# Compute percentages for both Seurat objects
healthy_percentages <- get_cluster_percentages(healthy)
obese_percentages <- get_cluster_percentages(obese)
# Create final dataframe
result_df <- data.frame(
Cluster = adipo_clusters,
Healthy = healthy_percentages,
Obese = obese_percentages
)
# Save as CSV
write.csv(result_df, "results/humanPVATsn/pathfindR/comparison1v3/cluster_percentages.csv", row.names = FALSE)
# Print dataframe
result_df
## Cluster Healthy Obese
## Early pre-adipocytes Early pre-adipocytes 15.777653 6.562974
## Intermediate pre-adipocytes Intermediate pre-adipocytes 39.668725 30.576631
## Transitional adipocytes Transitional adipocytes 5.923638 2.541730
## Mature adipocytes Mature adipocytes 38.629983 60.318665
pathfindR analysispathfindR is an R package designed for active-subnetwork-oriented pathway enrichment analysis. Unlike traditional over-representation analysis (ORA) and gene set enrichment analysis (GSEA), which evaluate gene lists without considering gene interactions, pathfindR integrates protein-protein interaction networks (PINs) to identify active subnetworks – groups of interacting genes enriched in significantly altered genes. A subnetwork is defined as a cluster of interconnected genes in a PIN, and it is considered active if it contains a disproportionately high number of differentially expressed genes. The algorithm then performs pathway enrichment analysis on these subnetworks to identify biologically relevant pathways.
The pathfindR workflow consists of the following steps:
Load marker files for healthy patient.
if (all(file.exists(healthy_marker_files))) {
h_early <- read_csv(healthy_marker_files[1])
h_intermediate <- read_csv(healthy_marker_files[2])
h_transitional <- read_csv(healthy_marker_files[3])
h_mature <- read_csv(healthy_marker_files[4])
} else {
stop("One or more healthy marker files are missing. Run FindMarkers first.")
}
## New names:
## Rows: 1040 Columns: 6
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): ...1 dbl (5): p_val, avg_log2FC, pct.1, pct.2, p_val_adj
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## New names:
## Rows: 1107 Columns: 6
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): ...1 dbl (5): p_val, avg_log2FC, pct.1, pct.2, p_val_adj
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## New names:
## Rows: 204 Columns: 6
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): ...1 dbl (5): p_val, avg_log2FC, pct.1, pct.2, p_val_adj
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## New names:
## Rows: 1530 Columns: 6
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): ...1 dbl (5): p_val, avg_log2FC, pct.1, pct.2, p_val_adj
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
Load marker files for obese patient.
if (all(file.exists(obese_marker_files))) {
o_early <- read_csv(obese_marker_files[1])
o_intermediate <- read_csv(obese_marker_files[2])
o_transitional <- read_csv(obese_marker_files[3])
o_mature <- read_csv(obese_marker_files[4])
} else {
stop("One or more obese marker files are missing. Run FindMarkers first.")
}
## New names:
## Rows: 1525 Columns: 6
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): ...1 dbl (5): p_val, avg_log2FC, pct.1, pct.2, p_val_adj
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## New names:
## Rows: 1457 Columns: 6
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): ...1 dbl (5): p_val, avg_log2FC, pct.1, pct.2, p_val_adj
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## New names:
## Rows: 192 Columns: 6
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): ...1 dbl (5): p_val, avg_log2FC, pct.1, pct.2, p_val_adj
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## New names:
## Rows: 1507 Columns: 6
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): ...1 dbl (5): p_val, avg_log2FC, pct.1, pct.2, p_val_adj
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
The data needs to be reformatted to align with the expected input of pathfindR. The input data frame must consist of columns containing: gene symbols, change values (optional) and p values.
# Make function to reformat to pathfindR input
reformat_pathfindR <- function(input) {
input %>%
select(Gene.symbol = ...1, logFC = avg_log2FC, adj.P.Val = p_val_adj) %>%
as.data.frame()
}
# Reformat healthy markers
h_early <- reformat_pathfindR(h_early)
h_intermediate <- reformat_pathfindR(h_intermediate)
h_transitional <- reformat_pathfindR(h_transitional)
h_mature <- reformat_pathfindR(h_mature)
# Reformat obese markers
o_early <- reformat_pathfindR(o_early)
o_intermediate <- reformat_pathfindR(o_intermediate)
o_transitional <- reformat_pathfindR(o_transitional)
o_mature <- reformat_pathfindR(o_mature)
pathfindRFor early pre-adipocytes.
h_early_output <- run_pathfindR(h_early,
gene_sets = "GO-BP",
min_gset_size = 5,
max_gset_size = 300
)
## ## Testing input
## The input looks OK
## ## Processing input. Converting gene symbols,
## if necessary (and if human gene symbols provided)
## Number of genes provided in input: 1040
## Number of genes in input after p-value filtering: 976
## pathfindR cannot handle p values < 1e-13. These were changed to 1e-13
##
## Could not find any interactions for 60 (6.15%) genes in the PIN
## Final number of genes in input: 916
## ## Performing Active Subnetwork Search and Enrichment
## ## Processing the enrichment results over all iterations
## ## Annotating involved genes and visualizing enriched terms
## Plotting the enrichment bubble chart
## Found 160 enriched terms
## You may run:
## - cluster_enriched_terms() for clustering enriched terms
## - visualize_terms() for visualizing enriched term diagrams
For intermediate pre-adipocytes.
h_inter_output <- run_pathfindR(h_intermediate,
gene_sets = "GO-BP",
min_gset_size = 5,
max_gset_size = 300
)
## ## Testing input
## The input looks OK
## ## Processing input. Converting gene symbols,
## if necessary (and if human gene symbols provided)
## Number of genes provided in input: 1107
## Number of genes in input after p-value filtering: 965
## pathfindR cannot handle p values < 1e-13. These were changed to 1e-13
## Could not find any interactions for 78 (8.08%) genes in the PIN
## Final number of genes in input: 887
## ## Performing Active Subnetwork Search and Enrichment
## ## Processing the enrichment results over all iterations
## ## Annotating involved genes and visualizing enriched terms
## Plotting the enrichment bubble chart
## Found 172 enriched terms
## You may run:
## - cluster_enriched_terms() for clustering enriched terms
## - visualize_terms() for visualizing enriched term diagrams
For transitional adipocytes.
h_trans_output <- run_pathfindR(h_transitional,
gene_sets = "GO-BP",
min_gset_size = 5,
max_gset_size = 300
)
## ## Testing input
## The input looks OK
## ## Processing input. Converting gene symbols,
## if necessary (and if human gene symbols provided)
## Number of genes provided in input: 204
## Number of genes in input after p-value filtering: 73
## Could not find any interactions for 9 (12.33%) genes in the PIN
## Final number of genes in input: 64
## ## Performing Active Subnetwork Search and Enrichment
## ## Processing the enrichment results over all iterations
## ## Annotating involved genes and visualizing enriched terms
## Plotting the enrichment bubble chart
## Found 27 enriched terms
## You may run:
## - cluster_enriched_terms() for clustering enriched terms
## - visualize_terms() for visualizing enriched term diagrams
For mature adipocytes.
h_mature_output <- run_pathfindR(h_mature,
gene_sets = "GO-BP",
min_gset_size = 5,
max_gset_size = 300
)
## ## Testing input
## The input looks OK
## ## Processing input. Converting gene symbols,
## if necessary (and if human gene symbols provided)
## Number of genes provided in input: 1530
## Number of genes in input after p-value filtering: 1336
## pathfindR cannot handle p values < 1e-13. These were changed to 1e-13
## Could not find any interactions for 110 (8.23%) genes in the PIN
## Final number of genes in input: 1226
## ## Performing Active Subnetwork Search and Enrichment
## ## Processing the enrichment results over all iterations
## ## Annotating involved genes and visualizing enriched terms
## Plotting the enrichment bubble chart
## Found 207 enriched terms
## You may run:
## - cluster_enriched_terms() for clustering enriched terms
## - visualize_terms() for visualizing enriched term diagrams
For early preadipocytes.
o_early_output <- run_pathfindR(o_early,
gene_sets = "GO-BP",
min_gset_size = 5,
max_gset_size = 300
)
## ## Testing input
## The input looks OK
## ## Processing input. Converting gene symbols,
## if necessary (and if human gene symbols provided)
## Number of genes provided in input: 1525
## Number of genes in input after p-value filtering: 1016
## pathfindR cannot handle p values < 1e-13. These were changed to 1e-13
## Could not find any interactions for 54 (5.31%) genes in the PIN
## Final number of genes in input: 962
## ## Performing Active Subnetwork Search and Enrichment
## ## Processing the enrichment results over all iterations
## ## Annotating involved genes and visualizing enriched terms
## Plotting the enrichment bubble chart
## Found 158 enriched terms
## You may run:
## - cluster_enriched_terms() for clustering enriched terms
## - visualize_terms() for visualizing enriched term diagrams
For intermediate pre-adipocytes.
o_inter_output <- run_pathfindR(o_intermediate,
gene_sets = "GO-BP",
min_gset_size = 5,
max_gset_size = 300
)
## ## Testing input
## The input looks OK
## ## Processing input. Converting gene symbols,
## if necessary (and if human gene symbols provided)
## Number of genes provided in input: 1457
## Number of genes in input after p-value filtering: 1207
## pathfindR cannot handle p values < 1e-13. These were changed to 1e-13
## Could not find any interactions for 76 (6.3%) genes in the PIN
## Final number of genes in input: 1131
## ## Performing Active Subnetwork Search and Enrichment
## ## Processing the enrichment results over all iterations
## ## Annotating involved genes and visualizing enriched terms
## Plotting the enrichment bubble chart
## Found 245 enriched terms
## You may run:
## - cluster_enriched_terms() for clustering enriched terms
## - visualize_terms() for visualizing enriched term diagrams
For transitional adipocytes, the analysis is not possible because the obese patient only has 192 of these cells following quality control thresholds.
For mature adipocytes.
o_mature_output <- run_pathfindR(o_mature,
gene_sets = "GO-BP",
min_gset_size = 5,
max_gset_size = 300
)
## ## Testing input
## The input looks OK
## ## Processing input. Converting gene symbols,
## if necessary (and if human gene symbols provided)
## Number of genes provided in input: 1507
## Number of genes in input after p-value filtering: 1318
## pathfindR cannot handle p values < 1e-13. These were changed to 1e-13
## Could not find any interactions for 80 (6.07%) genes in the PIN
## Final number of genes in input: 1238
## ## Performing Active Subnetwork Search and Enrichment
## ## Processing the enrichment results over all iterations
## ## Annotating involved genes and visualizing enriched terms
## Plotting the enrichment bubble chart
## Found 269 enriched terms
## You may run:
## - cluster_enriched_terms() for clustering enriched terms
## - visualize_terms() for visualizing enriched term diagrams
We save these objects as RDS for downstream analysis, as this is a time-consuming process.
# For healthy patient
saveRDS(h_early_output, file = "data/analysis/pathfindR/h_early_output.rds")
saveRDS(h_inter_output, file = "data/analysis/pathfindR/h_inter_output.rds")
saveRDS(h_trans_output, file = "data/analysis/pathfindR/h_trans_output.rds")
saveRDS(h_mature_output, file = "data/analysis/pathfindR/h_mature_output.rds")
# For obese patient
saveRDS(o_early_output, file = "data/analysis/pathfindR/o_early_output.rds")
saveRDS(o_inter_output, file = "data/analysis/pathfindR/o_inter_output.rds")
saveRDS(o_mature_output, file = "data/analysis/pathfindR/o_mature_output.rds")
pathfindR resultsThe pathfindR package provides a function to compare two different pathfindR output dataframes. More details can be found in this vignette.
For early-pre-adipocytes.
# Combine results
combined_early <- combine_pathfindR_results(
result_A = h_early_output,
result_B = o_early_output,
plot_common = FALSE
)
## You may run `combined_results_graph()` to create visualizations of combined term-gene graphs of selected terms
For intermediate pre-adipocytes.
# Combine results
combined_intermediate <- combine_pathfindR_results(
result_A = h_inter_output,
result_B = o_inter_output,
plot_common = FALSE
)
## You may run `combined_results_graph()` to create visualizations of combined term-gene graphs of selected terms
For transitional adipocytes, the comparison is not possible because the obese patient does not show a sufficient quantity of transitional adipocyte cells, therefore no conclusions can be drawn.
For mature adipocytes.
# Combine results
combined_mature <- combine_pathfindR_results(
result_A = h_mature_output,
result_B = o_mature_output,
plot_common = FALSE
)
## You may run `combined_results_graph()` to create visualizations of combined term-gene graphs of selected terms
We save the files to streamline the workflow, allowing for quick access when rerunning only the visualisation chunks. In the resulting dataframes, “A only” represents pathways exclusive to the healthy group, while “B only” corresponds to pathways unique to the obese group.
write.csv(combined_early, "results/humanPVATsn/pathfindR/comparison1v3/early_pre_healthy_vs_obese.csv", row.names = TRUE)
write.csv(combined_intermediate, "results/humanPVATsn/pathfindR/comparison1v3/intermediate_pre_healthy_vs_obese.csv", row.names = TRUE)
write.csv(combined_mature, "results/humanPVATsn/pathfindR/comparison1v3/mature_healthy_vs_obese.csv", row.names = TRUE)
Load data and list patient conditions.
# Load data
early_df <- read_csv("results/humanPVATsn/pathfindR/comparison1v3/early_pre_healthy_vs_obese.csv")
## New names:
## Rows: 233 Columns: 19
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (7): ID, Term_Description, Up_regulated_A, Down_regulated_A, Up_regulat... dbl
## (12): ...1, Fold_Enrichment_A, occurrence_A, support_A, lowest_p_A, high...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
intermediate_df <- read_csv("results/humanPVATsn/pathfindR/comparison1v3/intermediate_pre_healthy_vs_obese.csv")
## New names:
## Rows: 305 Columns: 19
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (7): ID, Term_Description, Up_regulated_A, Down_regulated_A, Up_regulat... dbl
## (12): ...1, Fold_Enrichment_A, occurrence_A, support_A, lowest_p_A, high...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
mature_df <- read_csv("results/humanPVATsn/pathfindR/comparison1v3/mature_healthy_vs_obese.csv")
## New names:
## Rows: 341 Columns: 19
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (7): ID, Term_Description, Up_regulated_A, Down_regulated_A, Up_regulat... dbl
## (12): ...1, Fold_Enrichment_A, occurrence_A, support_A, lowest_p_A, high...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# List of conditions
conditions <- c("A only", "B only", "common")
condition_labels <- list("A only" = "healthy only", "B only" = "obese only", "common" = "both healthy and obese")
Define function to create and save bar plots. - For the A only and B only pathways, we rank them based on fold enrichment. - For the common pathways, we rank them based on the combined p-value; this helps identify pathways that are consistently significant in both datasets.
create_and_save_bar_plot <- function(df, category, title_label, output_file) {
# Assign a color per condition
fill_col <- case_when(
category == "A only" ~ "#0072B2",
category == "B only" ~ "#E69F00",
category == "common" ~ "#009E73"
)
if(category == "common") {
sorting_col <- "combined_p"
df_filtered <- df %>%
filter(status == category) %>%
# Sort by descending combined p value
arrange(desc(combined_p)) %>%
# Keep only those with more than 5 occurrences
filter(occurrence_A > 5 & occurrence_B > 5) %>%
# Keep top 20 pathways for visualisation
slice_head(n = 20) %>%
# Ensure factor levels are correctly ordered for plotting
mutate(Term_Description = factor(Term_Description, levels = Term_Description))
# Adjust x-axis for "common" category to ensure longest bars at the top
x_var <- -log10(df_filtered[[sorting_col]]) # Convert p-values for better visualisation
x_label <- "-log10(combined P value)"
} else {
sorting_col <- paste0("Fold_Enrichment_", substr(category, 1, 1))
occurrence_col <- paste0("occurrence_", substr(category, 1, 1))
df_filtered <- df %>%
filter(status == category) %>%
# Sort by descending fold enrichment
arrange(desc(.data[[sorting_col]])) %>%
# Keep only those with more than 5 occurrences
filter(.data[[occurrence_col]] > 5) %>%
# Keep top 20 pathways for visualisation
slice_head(n = 20) %>%
# Ensure factor levels are correctly ordered for plotting
mutate(Term_Description = factor(Term_Description, levels = rev(Term_Description)))
x_var <- df_filtered[[sorting_col]]
x_label <- "Fold enrichment"
}
# Create bar plot
plot <- ggplot(df_filtered, aes(x = x_var, y = Term_Description)) +
geom_bar(stat = "identity", color = "black", fill = fill_col) +
labs(title = title_label, x = x_label,
y = "Pathway") +
theme_minimal() +
theme(legend.position = "none", plot.margin = margin(10, 20, 10, 10))
# Save plot
ggsave(output_file, plot, width = 12, height = 6, device = "pdf")
# Return plot
return(plot)
}
Generate plots for each differentiation stage: early pre-adipocytes, intermediate pre-adipocytes, and mature adipocytes.
for (condition in conditions) {
# Early adipocytes
early_plot <- create_and_save_bar_plot(early_df, condition, paste("Early pre-adipocytes: top 20 enriched pathways for", condition_labels[[condition]]), paste0("results/humanPVATsn/pathfindR/comparison1v3/early_", gsub(" ", "_", condition_labels[[condition]]), ".pdf"))
print(early_plot)
# Intermediate adipocytes
inter_plot <- create_and_save_bar_plot(intermediate_df, condition, paste("Intermediate pre-adipocytes: top 20 enriched pathways for", condition_labels[[condition]]), paste0("results/humanPVATsn/pathfindR/comparison1v3/intermediate_", gsub(" ", "_", condition_labels[[condition]]), ".pdf"))
print(inter_plot)
# Mature adipocytes
mature_plot <- create_and_save_bar_plot(mature_df, condition, paste("Mature adipocytes: top 20 enriched pathways for", condition_labels[[condition]]), paste0("results/humanPVATsn/pathfindR/comparison1v3/mature_", gsub(" ", "_", condition_labels[[condition]]), ".pdf"))
print(mature_plot)
}
The function cluster_enriched_terms clusters enriched pathways into biologically relevant umbrella terms. This is helpful for the downstream biological interpretation. It uses pairwise kappa statistics to perform hierarchical clustering of enriched terms to determine the representative term.
We load the previously saved RDS objects containing the pathfindR outputs.
# For healthy patient
h_early_output <- readRDS(file = "data/analysis/pathfindR/h_early_output.rds")
h_inter_output <- readRDS(file = "data/analysis/pathfindR/h_inter_output.rds")
h_trans_output <- readRDS(file = "data/analysis/pathfindR/h_trans_output.rds")
h_mature_output <- readRDS(file = "data/analysis/pathfindR/h_mature_output.rds")
# For obese patient
o_early_output <- readRDS(file = "data/analysis/pathfindR/o_early_output.rds")
o_inter_output <- readRDS(file = "data/analysis/pathfindR/o_inter_output.rds")
o_mature_output <- readRDS(file = "data/analysis/pathfindR/o_mature_output.rds")
# For pathways common to both patients (we will filter to keep only common pathways)
early_df <- read_csv("results/humanPVATsn/pathfindR/comparison1v3/early_pre_healthy_vs_obese.csv")
## New names:
## Rows: 233 Columns: 19
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (7): ID, Term_Description, Up_regulated_A, Down_regulated_A, Up_regulat... dbl
## (12): ...1, Fold_Enrichment_A, occurrence_A, support_A, lowest_p_A, high...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
intermediate_df <- read_csv("results/humanPVATsn/pathfindR/comparison1v3/intermediate_pre_healthy_vs_obese.csv")
## New names:
## Rows: 305 Columns: 19
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (7): ID, Term_Description, Up_regulated_A, Down_regulated_A, Up_regulat... dbl
## (12): ...1, Fold_Enrichment_A, occurrence_A, support_A, lowest_p_A, high...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
mature_df <- read_csv("results/humanPVATsn/pathfindR/comparison1v3/mature_healthy_vs_obese.csv")
## New names:
## Rows: 341 Columns: 19
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (7): ID, Term_Description, Up_regulated_A, Down_regulated_A, Up_regulat... dbl
## (12): ...1, Fold_Enrichment_A, occurrence_A, support_A, lowest_p_A, high...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
The three stage-specific dataframes need to be reformatted to serve as input to cluster_enriched_terms. The function requires the presence of the columns Term_Description, Down_regulated, and Up_regulated. The original dataframe has two columns for upregulated genes (Up_regulated_A and Up_regulated_B) and two columns for downregulated genes (Down_regulated_A and Down_regulated_B). I combine these two into one column named like Up_regulated and Down_regulated, which contains the upregulated and downregulated genes for both A and B combined. The lack of lowest_p column was causing errors so this was added to the dataframe as the lowest P value between A and B lowest P values, to highlight the best evidence. Similarly, for the added highest_p column, the highest value between the two patients was kept. For the added Fold_Enrichment and occurence columns, the mean between corresponding A and B columns was used to integrate data equally from both patients. However, for the added support column, which represents the fraction of significant input genes found to be involved in the pathway, the minimum was used to ensure the pathway is only seen as significant if well-supported in both patients. The remaining patient-specific columns are removed because they interfere with the cluster_enriched_terms function.
# Define function for reformatting
reformat_for_clustering <- function(df) {
res <- df %>%
# Keep only enriched pathways common to both patients
filter(status == "common") %>%
# Ensure operations are carried out on rows
rowwise() %>%
# Add necessary columns
mutate(
Fold_Enrichment = rowMeans(cbind(Fold_Enrichment_A, Fold_Enrichment_B), na.rm = TRUE),
occurrence = rowMeans(cbind(occurrence_A, occurrence_B)),
support = pmin(support_A, support_B, na.rm = TRUE),
lowest_p = pmin(lowest_p_A, lowest_p_B, na.rm = TRUE),
highest_p = pmax(highest_p_A, highest_p_B, na.rm = TRUE),
Up_regulated = {
parts <- c(Up_regulated_A, Up_regulated_B)
genes <- unlist(strsplit(paste(na.omit(parts), collapse = ", "), ",\\s*"))
if (length(genes) == 0) NA_character_ else paste(unique(genes), collapse = ", ")
},
Down_regulated = {
parts <- c(Down_regulated_A, Down_regulated_B)
genes <- unlist(strsplit(paste(na.omit(parts), collapse = ", "), ",\\s*"))
if (length(genes) == 0) NA_character_ else paste(unique(genes), collapse = ", ")
}
) %>%
# Revert the effect of rowwise()
ungroup() %>%
# Convert to data frame
as.data.frame() %>%
# Remove first column as it causes downstream errors and adds no meaningful information
select(-1) %>%
subset(select = -c(Fold_Enrichment_A, occurrence_A, support_A, lowest_p_A, highest_p_A, Up_regulated_A, Down_regulated_A,
Fold_Enrichment_B, occurrence_B, support_B, lowest_p_B, highest_p_B, Up_regulated_B, Down_regulated_B,
combined_p, status))
return(res)
}
# Reformat all three dataframes
early_df_filt <- reformat_for_clustering(early_df)
inter_df_filt <- reformat_for_clustering(intermediate_df)
mature_df_filt <- reformat_for_clustering(mature_df)
Cluster enriched terms. This will group enriched pathways into clusters, and each cluster will be given a certain number of “representative” pathways. It is common practice to filter this output as follows:
- Fold enrichment > 2: this is considered biologically meaningful, as the term is present at least twice as much as in the background
- Occurrence >= 9: this ensures only terms present in over 9 out of 10 pathfindR iterations are kept, for high confidence
- Lowest_p < 0.05: this ensures only statistically significant terms are kept
This function cluster_and_plot_pathways clusters enriched terms, filters representative pathways based on robustness criteria, visualises them as a bar plot, and saves the result.
cluster_and_plot_pathways <- function(pathfindR_output, output_prefix, title_label = NULL,
output_dir = "results/humanPVATsn/pathfindR/comparison1v3/",
save_plot = TRUE, save_csv = TRUE,
colour_group = "#2166AC") {
# Cluster enriched terms
clustered <- cluster_enriched_terms(pathfindR_output, plot_dend = FALSE, plot_clusters_graph = FALSE)
# Filter to keep only robust representative terms
clustered_rep <- clustered %>%
filter(Status == "Representative",
Fold_Enrichment > 2,
occurrence >= 9,
lowest_p < 0.05)
# Handle title if not provided
if (is.null(title_label)) {
title_label <- paste("Clusters of enriched pathways –", output_prefix)
}
# Visualise with a barplot
plot <- ggplot(clustered_rep,
aes(x = Fold_Enrichment, y = reorder(Term_Description, Fold_Enrichment))) +
geom_bar(stat = "identity", color = "black", fill = colour_group) +
labs(title = title_label,
x = "Fold enrichment",
y = "Pathway") +
theme_minimal() +
theme(legend.position = "none", plot.margin = margin(10, 20, 10, 10))
# Define output file paths
pdf_file <- file.path(output_dir, paste0(output_prefix, "_clustered.pdf"))
csv_file <- file.path(output_dir, paste0(output_prefix, "_clustered_filtered.csv"))
# Save outputs
if (save_plot) ggsave(pdf_file, plot, width = 12, height = 6, device = "pdf")
if (save_csv) write.csv(clustered_rep, csv_file, row.names = FALSE)
# Return table and plot
return(list(data = clustered_rep, plot = plot))
}
This function is called on all pathway enrichment analyses.
# Define inputs
pathfindR_outputs <- list(
early_healthy = h_early_output,
early_obese = o_early_output,
early_common = early_df_filt,
inter_healthy = h_inter_output,
inter_obese = o_inter_output,
inter_common = inter_df_filt,
trans_healthy = h_trans_output,
mature_healthy = h_mature_output,
mature_obese = o_mature_output,
mature_common = mature_df_filt
)
# Define matching labels and output names
run_labels <- list(
early_healthy = "Early pre-adipocytes: clusters of enriched pathways for healthy only",
early_obese = "Early pre-adipocytes: clusters of enriched pathways for obese only",
early_common = "Early pre-adipocytes: clusters of enriched pathways for both patients",
inter_healthy = "Intermediate pre-adipocytes: clusters of enriched pathways for healthy only",
inter_obese = "Intermediate pre-adipocytes: clusters of enriched pathways for obese only",
inter_common = "Intermediate pre-adipocytes: clusters of enriched pathways for both patients",
trans_healthy = "Transitional adipocytes: clusters of enriched pathways for healthy only",
mature_healthy = "Mature adipocytes: clusters of enriched pathways for healthy only",
mature_obese = "Mature adipocytes: clusters of enriched pathways for obese only",
mature_common = "Mature adipocytes: clusters of enriched pathways for both patients"
)
# Define plot colours
colour_lookup <- list(
early_healthy = "#0072B2",
early_obese = "#E69F00",
early_common = "#009E73",
inter_healthy = "#0072B2",
inter_obese = "#E69F00",
inter_common = "#009E73",
trans_healthy = "#0072B2",
mature_healthy = "#0072B2",
mature_obese = "#E69F00",
mature_common = "#009E73"
)
# Run all analyses
results_list <- list()
for (name in names(pathfindR_outputs)) {
# Call the function on each adipocyte stage
output <- cluster_and_plot_pathways(
pathfindR_output = pathfindR_outputs[[name]],
output_prefix = paste0(name),
title_label = run_labels[[name]],
colour_group = colour_lookup[[name]]
)
# Save results
results_list[[name]] <- output
# Print the plots
print(output$plot)
}
## The maximum average silhouette width was 0.18 for k = 40
## The maximum average silhouette width was 0.16 for k = 70
## The maximum average silhouette width was 0.16 for k = 40
## The maximum average silhouette width was 0.18 for k = 60
## The maximum average silhouette width was 0.15 for k = 100
## The maximum average silhouette width was 0.13 for k = 40
## The maximum average silhouette width was 0.37 for k = 10
## The maximum average silhouette width was 0.15 for k = 80
## The maximum average silhouette width was 0.16 for k = 100
## The maximum average silhouette width was 0.11 for k = 60
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: aarch64-apple-darwin20
## Running under: macOS 15.3.2
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Europe/London
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] pathfindR_2.4.1 pathfindR.data_2.1.0 lubridate_1.9.4
## [4] forcats_1.0.0 stringr_1.5.1 purrr_1.0.2
## [7] readr_2.1.5 tidyr_1.3.1 tibble_3.2.1
## [10] tidyverse_2.0.0 ggpubr_0.6.0.999 ggplot2_3.5.1
## [13] patchwork_1.3.0 dplyr_1.1.4 SeuratObject_4.1.4
## [16] Seurat_4.4.0 GEOquery_2.72.0 Biobase_2.64.0
## [19] BiocGenerics_0.50.0 BiocStyle_2.32.1
##
## loaded via a namespace (and not attached):
## [1] RcppAnnoy_0.0.22 splines_4.4.1 later_1.4.1
## [4] R.oo_1.27.0 polyclip_1.10-7 lifecycle_1.0.4
## [7] rstatix_0.7.2 doParallel_1.0.17 prabclus_2.3-4
## [10] globals_0.16.3 lattice_0.22-6 vroom_1.6.5
## [13] MASS_7.3-64 backports_1.5.0 magrittr_2.0.3
## [16] limma_3.60.6 plotly_4.10.4 sass_0.4.9
## [19] rmarkdown_2.29 jquerylib_0.1.4 yaml_2.3.10
## [22] httpuv_1.6.15 sctransform_0.4.1 flexmix_2.3-19
## [25] sp_2.2-0 spatstat.sparse_3.1-0 reticulate_1.40.0
## [28] DBI_1.2.3 cowplot_1.1.3 pbapply_1.7-2
## [31] RColorBrewer_1.1-3 zlibbioc_1.50.0 abind_1.4-8
## [34] Rtsne_0.17 R.utils_2.12.3 ggraph_2.2.1
## [37] nnet_7.3-20 tweenr_2.0.3 GenomeInfoDbData_1.2.12
## [40] IRanges_2.38.1 S4Vectors_0.42.1 ggrepel_0.9.6
## [43] irlba_2.3.5.1 listenv_0.9.1 spatstat.utils_3.1-2
## [46] goftest_1.2-3 spatstat.random_3.3-2 fitdistrplus_1.2-2
## [49] parallelly_1.42.0 leiden_0.4.3.1 codetools_0.2-20
## [52] xml2_1.3.6 ggforce_0.4.2 tidyselect_1.2.1
## [55] UCSC.utils_1.0.0 farver_2.1.2 viridis_0.6.5
## [58] stats4_4.4.1 matrixStats_1.5.0 spatstat.explore_3.3-4
## [61] jsonlite_1.8.9 tidygraph_1.3.1 progressr_0.15.1
## [64] Formula_1.2-5 ggridges_0.5.6 survival_3.8-3
## [67] iterators_1.0.14 systemfonts_1.2.1 foreach_1.5.2
## [70] tools_4.4.1 ragg_1.3.3 ica_1.0-3
## [73] Rcpp_1.0.14 glue_1.8.0 gridExtra_2.3
## [76] xfun_0.50 GenomeInfoDb_1.40.1 withr_3.0.2
## [79] BiocManager_1.30.25 fastmap_1.2.0 digest_0.6.37
## [82] timechange_0.3.0 R6_2.5.1 mime_0.12
## [85] textshaping_1.0.0 colorspace_2.1-1 scattermore_1.2
## [88] tensor_1.5 RSQLite_2.3.9 spatstat.data_3.1-4
## [91] diptest_0.77-1 R.methodsS3_1.8.2 generics_0.1.3
## [94] data.table_1.16.4 robustbase_0.99-4-1 class_7.3-23
## [97] graphlayouts_1.2.2 httr_1.4.7 htmlwidgets_1.6.4
## [100] uwot_0.2.2 pkgconfig_2.0.3 gtable_0.3.6
## [103] modeltools_0.2-23 blob_1.2.4 lmtest_0.9-40
## [106] XVector_0.44.0 htmltools_0.5.8.1 carData_3.0-5
## [109] bookdown_0.42 scales_1.3.0 png_0.1-8
## [112] spatstat.univar_3.1-1 knitr_1.49 rstudioapi_0.17.1
## [115] tzdb_0.4.0 reshape2_1.4.4 nlme_3.1-167
## [118] org.Hs.eg.db_3.20.0 cachem_1.1.0 zoo_1.8-12
## [121] KernSmooth_2.23-26 parallel_4.4.1 miniUI_0.1.1.1
## [124] AnnotationDbi_1.68.0 pillar_1.10.1 grid_4.4.1
## [127] vctrs_0.6.5 RANN_2.6.2 promises_1.3.2
## [130] car_3.1-3 xtable_1.8-4 cluster_2.1.8
## [133] evaluate_1.0.3 tinytex_0.54 magick_2.8.5
## [136] cli_3.6.3 compiler_4.4.1 crayon_1.5.3
## [139] rlang_1.1.5 future.apply_1.11.3 ggsignif_0.6.4
## [142] labeling_0.4.3 mclust_6.1.1 plyr_1.8.9
## [145] stringi_1.8.4 viridisLite_0.4.2 deldir_2.0-4
## [148] Biostrings_2.72.1 munsell_0.5.1 lazyeval_0.2.2
## [151] spatstat.geom_3.3-5 Matrix_1.7-2 hms_1.1.3
## [154] bit64_4.6.0-1 future_1.34.0 fpc_2.2-13
## [157] KEGGREST_1.44.1 statmod_1.5.0 shiny_1.10.0
## [160] kernlab_0.9-33 ROCR_1.0-11 igraph_2.1.4
## [163] broom_1.0.7 memoise_2.0.1 bslib_0.9.0
## [166] DEoptimR_1.1-3-1 bit_4.5.0.1